via GIPHY

This tutorial will demonstrate how to use the h2o R package to combine H2O models with XGBoost models into a Stacked Ensemble.

Install XGBoost-enabled H2O

Currently, XGBoost is available in a special development edition of H2O, available (temporarily) here. Download the file, unzip it, and cd to the ./R/ directory. Install the R package: R CMD INSTALL ./R/h2o_3.11.0.99999.tar.gz

H2O ships with everything – except the system library for multithreading (OpenMP). If you are on a Mac, you will need to install OpenMP.

Mac

# Install OpenMP (required for the xgboost-enabled h2o build)
brew install gcc --without-multilib

For now, let’s assume the h2o (+xgb) R package is installed correctly.

Train Base Learners

Let’s train and cross-validate a set of H2O and XGBoost models and then create a Stacked Ensemble using the h2o R package.

Start H2O Cluster & Load Data

# Attach the h2o package. The startup message below (console transcript)
# lists the base/stats functions that h2o masks with H2OFrame equivalents.
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start (or connect to) a local H2O cluster; nthreads = -1 uses all cores.
h2o.init(nthreads = -1)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         58 minutes 17 seconds 
##     H2O cluster version:        3.11.0.99999 
##     H2O cluster version age:    2 months and 7 days  
##     H2O cluster name:           H2O_started_from_R_me_cqy153 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.15 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     R Version:                  R version 3.3.2 (2016-10-31)
h2o.no_progress() # Don't show progress bars in RMarkdown output

# Import a sample binary outcome train/test set into H2O
# (Higgs dataset sample: 10k training rows, 5k test rows, hosted on S3)
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)  # every remaining column is a predictor

# For binary classification, response should be a factor
# (otherwise H2O would treat the 0/1 column as a regression target)
train[,y] <- as.factor(train[,y])
test[,y] <- as.factor(test[,y])

# Number of CV folds (to generate level-one data for stacking)
nfolds <- 5

H2O base models

# Train & cross-validate a GBM base learner.
# Cross-validated predictions are kept so they can serve as level-one
# data for stacking; "Modulo" fold assignment guarantees every base
# learner sees the exact same fold splits.
my_gbm <- h2o.gbm(x = x, y = y,
                  training_frame = train,
                  distribution = "bernoulli",
                  ntrees = 10,
                  max_depth = 3,
                  min_rows = 2,
                  learn_rate = 0.2,
                  seed = 1,
                  nfolds = nfolds,
                  fold_assignment = "Modulo",
                  keep_cross_validation_predictions = TRUE)

# Train & cross-validate a Random Forest base learner with the same
# fold setup ("Modulo", kept CV predictions) as the other base models.
my_rf <- h2o.randomForest(x = x, y = y,
                          training_frame = train,
                          ntrees = 50,
                          seed = 1,
                          nfolds = nfolds,
                          fold_assignment = "Modulo",
                          keep_cross_validation_predictions = TRUE)


# Train & Cross-validate a DNN
my_dl <- h2o.deeplearning(x = x,
                          y = y,
                          training_frame = train,
                          nfolds = nfolds,
                          fold_assignment = "Modulo",
                          keep_cross_validation_predictions = TRUE,
                          seed = 1)

XGBoost base models

# Train & cross-validate an XGBoost GBM via the h2o.xgboost() wrapper,
# mirroring the fold setup of the H2O base models so its CV predictions
# can be stacked alongside theirs.
my_xgb1 <- h2o.xgboost(x = x, y = y,
                       training_frame = train,
                       distribution = "bernoulli",
                       ntrees = 100,
                       max_depth = 3,
                       min_rows = 2,
                       learn_rate = 0.2,
                       seed = 1,
                       nfolds = nfolds,
                       fold_assignment = "Modulo",
                       keep_cross_validation_predictions = TRUE)

Create a Stacked Ensemble

To maximize predictive power, we will create an H2O Stacked Ensemble from the models we trained above and print the performance gain the ensemble has over the best base model.

# Train a stacked ensemble using the H2O and XGBoost models from above.
# Gather each base learner's model id with vapply() instead of listing
# the ids one by one.
base_models <- as.list(vapply(list(my_gbm, my_rf, my_dl, my_xgb1),
                              function(m) m@model_id,
                              character(1)))

ensemble <- h2o.stackedEnsemble(x = x, y = y,
                                training_frame = train,
                                base_models = base_models,
                                model_id = "h2o_xgb_ensemble")

# Eval ensemble performance on a test set
perf <- h2o.performance(ensemble, newdata = test)


# Compare to base learner performance on the test set
perf_gbm_test <- h2o.performance(my_gbm, newdata = test)
perf_rf_test <- h2o.performance(my_rf, newdata = test)
perf_dl_test <- h2o.performance(my_dl, newdata = test)
# TO DO: Fix this bug, right now h2o.performance does not work on xgb models
# (presumably an issue in this development build of h2o — re-enable the
# commented lines below once it is resolved)
#perf_xgb1_test <- h2o.performance(my_xgb1, newdata = test) 
#Error in Filter(function(mm) { : subscript out of bounds  
# Best test-set AUC among the working base learners; the XGB model is
# excluded until the h2o.performance() bug above is fixed.
baselearner_best_auc_test <- max(h2o.auc(perf_gbm_test), 
                                 h2o.auc(perf_rf_test),
                                 h2o.auc(perf_dl_test))#,
#                                 h2o.auc(perf_xgb1_test))
ensemble_auc_test <- h2o.auc(perf)
# The ensemble should beat the best single base learner on test AUC.
print(sprintf("Best Base-learner Test AUC:  %s", baselearner_best_auc_test))
## [1] "Best Base-learner Test AUC:  0.769803926300421"
print(sprintf("Ensemble Test AUC:  %s", ensemble_auc_test))
## [1] "Ensemble Test AUC:  0.789289348472234"